Analysis of New York State’s 2020-2021 probability-based sampling results

This markdown uses the spsurvey package developed by Michael Dumelle, Tom Kincaid, Tony Olsen, and Marc Weber at EPA to analyze the results of New York’s spatially-balanced probability-based survey in 2020-2021. The results include categorical condition extent estimates for statewide conditions, cumulative distribution functions for water quality parameters, and comparisons to the National Lakes Assessment (nationally and regionally), New Hampshire and to New York’s historical data (LMAS samples since 2010).

Load packages

start.time <- Sys.time()

library(tidyverse)
library(huxtable)
library(ggplot2)
library(lubridate)
library(egg)
library(gridExtra)
library(ggpattern)
library(ggmap)
library(maps)
library(spsurvey) #This code was written for spsurvey v5.0.0

Load data

To run cat_analysis(), the dataframe will need to include columns for the categorical variable you are interested in, weight for each site, xcoordinate and ycoordinate. Optionally, you can also include a subpopulation or stratum column to separate results and site IDs for two-stage analysis (for example, if there are repeated sites).

In this script, weight is adjusted because of the proportional design. In the future, this can be excluded as the design should include weights already.

I am designating relevant variables from the dataset as well as creating new variables: mean dissolved oxygen from 0-2m, epilimentic means for ph and spconductance, total nitrogen and DIN:TP ratio.

# retrieve raw data from database
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Current")
source("new_database/Reading.LMAS.Data.R")
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Probability.Sampling/")

rm(list=setdiff(ls(), c('newdata',"start.time")))
#pick out relevant variables
lab<-newdata %>%
  filter(SAMPLE_DATE>'2020-01-01') %>%
  mutate(combined=paste(CHARACTERISTIC_NAME,
                        INFORMATION_TYPE,
                        RSLT_RESULT_SAMPLE_FRACTION,
                        sep = "_"))  %>% #combining parameters into 1 name
  select(LAKE_HISTORY_ID,
         SAMPLE_DATE,
         combined,
         RSLT_RESULT_VALUE,
         RSLT_LABORATORY_QUALIFIER,
         RSLT_VALIDATOR_QUALIFIER,
         RSLT_PROFILE_DEPTH) %>%
  mutate(RSLT_RESULT_VALUE=ifelse(!is.na(RSLT_LABORATORY_QUALIFIER)&(RSLT_LABORATORY_QUALIFIER=="U"|RSLT_LABORATORY_QUALIFIER=="UE"),"0",RSLT_RESULT_VALUE),
         RSLT_RESULT_VALUE=as.numeric(RSLT_RESULT_VALUE)) %>%
  filter(!is.na(RSLT_RESULT_VALUE),
         is.na(RSLT_VALIDATOR_QUALIFIER)|(RSLT_VALIDATOR_QUALIFIER!="R"), #removing rejected samples
         combined %in% c('CHLOROPHYLL A_OW_TOTAL', #picking parameters
                         'PHOSPHORUS, TOTAL_OW_TOTAL',
                         "MICROCYSTIN_OW_NA",
                         "NITROGEN, NITRATE (AS N)_OW_TOTAL",
                         "NITROGEN, NITRATE-NITRITE_OW_TOTAL",
                         "NITROGEN, KJELDAHL, TOTAL_OW_TOTAL",
                         "NITROGEN, TOTAL_OW_TOTAL",
                         "DISSOLVED OXYGEN_DP_NA",
                         "TRUE COLOR_OW_TOTAL",
                         "PH_DP_NA",
                         "SPECIFIC CONDUCTANCE_DP_NA",
                         "TEMPERATURE_DP_NA",
                         "DEPTH, SECCHI DISK DEPTH_SD_NA",
                         "CALCIUM_OW_TOTAL",
                         "CHLORIDE_OW_TOTAL",
                         "CHLOROPHYLL A (PROBE) CONCENTRATION, CHLOROPHYTE (GREEN ALGAE)_OW_NA",
                         "CHLOROPHYLL A (PROBE) CONCENTRATION, CRYPTOPHYTA (CRYPTOPHYTES)_OW_NA",
                         "CHLOROPHYLL A (PROBE) CONCENTRATION, CYANOBACTERIA (BLUEGREEN)_OW_NA",
                         "CHLOROPHYLL A (PROBE) CONCENTRATION, DINOPHYTA (DIATOMS)_OW_NA",
                         "CHLOROPHYLL A (PROBE) CONCENTRATION, TOTAL_OW_NA",
                         "ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL",
                         "ARSENIC_BS_TOTAL",
                         "ARSENIC_OW_TOTAL",
                         "IRON_BS_TOTAL",
                         "IRON_OW_TOTAL",
                         "MAGNESIUM_BS_TOTAL",
                         "MAGNESIUM_OW_TOTAL",
                         "CARBON, DISSOLVED ORGANIC (DOC)_OW_DISSOLVED",
                         "NITROGEN, AMMONIA (AS N)_OW_TOTAL")) %>%
  select(LAKE_HISTORY_ID,SAMPLE_DATE,combined,RSLT_RESULT_VALUE,RSLT_PROFILE_DEPTH) %>%
  distinct(LAKE_HISTORY_ID,SAMPLE_DATE,combined,RSLT_PROFILE_DEPTH,.keep_all = TRUE) %>%
  rename(LAKE_ID=LAKE_HISTORY_ID,
         chemical_name=combined,
         result_value=RSLT_RESULT_VALUE,
         profile_depth=RSLT_PROFILE_DEPTH)

# dissolved oxygen:
# keep DO values between 0 and 2m depth
lab2 <- lab %>% 
  mutate(keep=case_when(
    !(chemical_name=="DISSOLVED OXYGEN_DP_NA") ~ "no",
    chemical_name=="DISSOLVED OXYGEN_DP_NA" & profile_depth <= 2 ~ "yes",
    chemical_name=="DISSOLVED OXYGEN_DP_NA" & profile_depth > 2 ~ "no")) %>%
  filter(keep!="no") %>%
  select(-keep)

# mean DO for each lake each date
DO <- aggregate(lab2$result_value,list(lab2$LAKE_ID,lab2$SAMPLE_DATE),FUN=mean) %>% 
  mutate(chemical_name="DISSOLVED OXYGEN_epi") %>% 
  rename(LAKE_ID=Group.1,
         SAMPLE_DATE=Group.2,
         result_value=x)

#thermocline
thermocline<-lab %>% 
  filter(chemical_name=='TEMPERATURE_DP_NA') %>% 
  mutate(thermocline=NA)
LID<-thermocline$LAKE_ID[1]
date<-thermocline$SAMPLE_DATE[1]
depth<-thermocline$profile_depth[1]
temp<-thermocline$result_value[1]

for(i in seq(nrow(thermocline))){
  current<-thermocline[i,]
  if(current$LAKE_ID==LID&current$SAMPLE_DATE==date){
    depth=current$profile_depth
    if((temp-current$result_value)>1){
      thermocline$thermocline[i]<-current$profile_depth
    }
    temp=current$result_value
  }
  else{
    LID=current$LAKE_ID
    date=current$SAMPLE_DATE
    depth=current$profile_depth
    temp=current$result_value
  }
}

thermocline<-thermocline %>%  #pull the lowest value for all
  group_by(LAKE_ID,SAMPLE_DATE) %>% 
  mutate(thermocline=min(thermocline, na.rm=T)) %>% 
  ungroup() %>% 
  mutate(thermocline=ifelse(thermocline==Inf,NA,thermocline)) %>% 
  select(LAKE_ID,SAMPLE_DATE,thermocline,profile_depth) %>% 
  filter(!is.na(thermocline)) %>%   #remove those without a thermocline
  distinct()

#epilimentic means for ph and spconductance
epi<-lab %>% 
  inner_join(thermocline,by=c('LAKE_ID','SAMPLE_DATE','profile_depth')) %>% 
  filter(chemical_name %in% c("PH_DP_NA","SPECIFIC CONDUCTANCE_DP_NA")) %>% 
  mutate(result_value = ifelse(chemical_name=="PH_DP_NA",(10^result_value),result_value)) %>%  #convert ph to [H+]
  distinct() %>% 
  arrange(LAKE_ID,SAMPLE_DATE,chemical_name,profile_depth) 

epi<-epi %>% 
  filter(profile_depth<=thermocline) %>% #shallower than thermocline
  distinct() %>% 
  group_by(LAKE_ID,SAMPLE_DATE,chemical_name) %>% 
  summarize(Mean=mean(result_value,na.rm=TRUE), #creating epilimnetic mean
            n=n()) %>% 
  filter(n>2) %>% 
  select(LAKE_ID,SAMPLE_DATE,chemical_name,Mean)

epi<-epi %>% 
  mutate(Mean=ifelse(chemical_name=="PH_DP_NA",log10(Mean),Mean)) %>% #[H+] back to pH
  rename(result_value=Mean) %>% 
  mutate(chemical_name=case_when(
    chemical_name=="PH_DP_NA"~"PH_epi",
    chemical_name=="SPECIFIC CONDUCTANCE_DP_NA"~"SPECIFIC CONDUCTANCE_epi"
  ))

#hypolimnetic means for ph and dissolved oxygen, as above
hypo<-lab %>% 
  inner_join(thermocline,by=c('LAKE_ID','SAMPLE_DATE','profile_depth')) %>% 
  filter(chemical_name %in% c("PH_DP_NA","DISSOLVED OXYGEN_DP_NA")) %>% 
  mutate(result_value = ifelse(chemical_name=="PH_DP_NA",(10^result_value),result_value)) %>% 
  distinct() %>% 
  arrange(LAKE_ID,SAMPLE_DATE,chemical_name,profile_depth) 

hypo<-hypo %>% 
  filter(profile_depth>=thermocline) %>%  #deeper than thermocline
  distinct() %>% 
  group_by(LAKE_ID,SAMPLE_DATE,chemical_name) %>% 
  summarize(Mean=mean(result_value,na.rm=TRUE),
            n=n()) %>% 
  filter(n>2) %>% 
  select(LAKE_ID,SAMPLE_DATE,chemical_name,Mean)

hypo<-hypo %>% 
  mutate(Mean=ifelse(chemical_name=="PH_DP_NA",log10(Mean),Mean)) %>% 
  rename(result_value=Mean) %>% 
  mutate(chemical_name=case_when(
    chemical_name=="PH_DP_NA"~"PH_hypo",
    chemical_name=="DISSOLVED OXYGEN_DP_NA"~"DISSOLVED_OXYGEN_mg_L"
  ))

# read in site data
sites<-read.csv("Probability_Based_Sites_2020_2021.csv")
sites<-sites %>% 
  rename(siteID=SITE_ID,
         xcoord=LON_DD83,
         ycoord=LAT_DD83,
         Eval_Status=EvalStatus) %>% 
  filter(Eval_Status!="") %>%    #removing sites that we haven't yet evaluated
  mutate(Eval_Status=trimws(Eval_Status))

# restrict to only the data in the probability study and spread the data
att<-merge(lab,epi,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE) %>%  #merging in epi means for pH and spcond
  mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>% 
  select(-result_value.x,-result_value.y)
att<-merge(att,hypo,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE) %>%  #merging in hypo means for pH and DO
  mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>% 
  select(-result_value.x,-result_value.y)
att<-merge(att,DO,by=c("LAKE_ID","SAMPLE_DATE","chemical_name"),all=TRUE)%>% #merging in epi DO (epa method)
  mutate(result_value=ifelse(is.na(result_value.x),result_value.y,result_value.x)) %>% 
  filter(!chemical_name %in% c("DISSOLVED OXYGEN_DP_NA","PH_DP_NA","SPECIFIC CONDUCTANCE_DP_NA","TEMPERATURE_DP_NA")) %>% 
  select(-result_value.x,-result_value.y)
att<-merge(att,sites,by=c('LAKE_ID'),all.y = TRUE) %>% distinct() #filtering by site list
att<-att %>% spread(chemical_name,result_value,fill = NA)

att<-att %>% filter((Eval_Status=="Target_Sampled"&!is.na(`CHLOROPHYLL A_OW_TOTAL`)&!is.na(`PHOSPHORUS, TOTAL_OW_TOTAL`)) | #keep samples with valid results in both chl and tp if sampled
                      (Eval_Status!="Target_Sampled") | #keep unsampled
                      (LAKE_ID %in% c("0703UWBXXX1","0801KAY0984A","1203MET0821","0602LUD0099","0801GUL0969"))) %>% #special lakes that would get otherwise deleted because they do not have valid data but were sampled and only have 1 sampling event
  rename("CHLOROPHYLL_ug_L"=`CHLOROPHYLL A_OW_TOTAL`, #rename without spaces for cont_analysis later
         "PHOSPHORUS_mg_L"=`PHOSPHORUS, TOTAL_OW_TOTAL`,
         "ALKALINITY_mg_L"=`ALKALINITY, TOTAL (AS CACO3)_OW_TOTAL`,
         "CHLORIDE_mg_L"=CHLORIDE_OW_TOTAL,
         "CALCIUM_mg_L"=CALCIUM_OW_TOTAL,
         "DOC_mg_L"=`CARBON, DISSOLVED ORGANIC (DOC)_OW_DISSOLVED`,
         "SECCHI_m"=`DEPTH, SECCHI DISK DEPTH_SD_NA`,
         "TRUE_COLOR"=`TRUE COLOR_OW_TOTAL`,
         "SPEC_CONDUCTANCE"=`SPECIFIC CONDUCTANCE_epi`,
         "AMMONIA_N"=`NITROGEN, AMMONIA (AS N)_OW_TOTAL`)

#creating thresholds
att<-att %>% 
  # remove fields in the sites table that aren't relevant
  select(-Accessible,-Comments,-Contact,-STATUS) %>%
  # create total nitrogen
  mutate(NITROGEN_mg_L=case_when(
    !is.na(`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`) ~
      (`NITROGEN, NITRATE-NITRITE_OW_TOTAL`+`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`),
    is.na(`NITROGEN, KJELDAHL, TOTAL_OW_TOTAL`) ~ `NITROGEN, TOTAL_OW_TOTAL`)) %>%
  # DIN:TP
  mutate(NP_ratio=`NITROGEN, NITRATE (AS N)_OW_TOTAL`/PHOSPHORUS_mg_L) %>% 
  # trophic status
  mutate(phos_trophic=case_when(
    PHOSPHORUS_mg_L<=0.01 ~ "Oligotrophic",
    between(PHOSPHORUS_mg_L,0.01,0.02) ~ "Mesotrophic",
    PHOSPHORUS_mg_L>=0.02 ~ "Eutrophic")) %>%
  mutate(chla_trophic=case_when(
    CHLOROPHYLL_ug_L<=2 ~ "Oligotrophic",
    between(CHLOROPHYLL_ug_L,2,8) ~ "Mesotrophic",
    CHLOROPHYLL_ug_L>=8 ~ "Eutrophic")) %>%
  # EPA thresholds
  mutate(TP_threshold=case_when(
    PHOSPHORUS_mg_L<=0.016 ~ "Good",
    between(PHOSPHORUS_mg_L, 0.016, 0.0279) ~ "Fair",
    PHOSPHORUS_mg_L>=0.0279 ~ "Poor")) %>%
  mutate(TN_threshold=case_when(
    NITROGEN_mg_L<=0.428 ~ "Good",
    between(NITROGEN_mg_L, 0.428, 0.655) ~ "Fair",
    NITROGEN_mg_L>=0.655 ~ "Poor")) %>%
  mutate(CHLA_threshold=case_when(
    CHLOROPHYLL_ug_L<=4.52 ~ "Good",
    between(CHLOROPHYLL_ug_L, 4.52, 8.43) ~ "Fair",
    CHLOROPHYLL_ug_L>=8.43 ~ "Poor")) %>%
  # microcystin
  mutate(microcystin=case_when(
    is.na(`MICROCYSTIN_OW_NA`) ~"Non-detect",
    `MICROCYSTIN_OW_NA`<8 ~ "Microcystin Detected",
    `MICROCYSTIN_OW_NA`>=8 ~ "Most disturbed")) %>%
  # dissolved oxygen
  mutate(d.oxygen=case_when(
    `DISSOLVED OXYGEN_epi`<=3 ~ "Poor",
    between(`DISSOLVED OXYGEN_epi`, 3, 5) ~ "Fair",
    `DISSOLVED OXYGEN_epi`>=5 ~ "Good")) %>% 
  #nutrient limitation
  mutate(N_LIMIT=case_when(
    NITROGEN_mg_L/PHOSPHORUS_mg_L<=10 ~ "N-limited",
    between(NITROGEN_mg_L/PHOSPHORUS_mg_L, 10, 20) ~ "Co-limited",
    NITROGEN_mg_L/PHOSPHORUS_mg_L>=20 ~ "P-limited")) %>% 
  #secchi
  mutate(secchi=case_when(
    SECCHI_m<=2 ~ "Eutrophic",
    between(SECCHI_m, 2, 5) ~ "Mesotrophic",
    SECCHI_m>=5 ~ "Oligotrophic")) %>% 
  #zebra mussels
  mutate(zebra=case_when(
    CALCIUM_mg_L<=10 ~ "Not susceptible",
    between(CALCIUM_mg_L, 10, 20) ~ "May be susceptible",
    CALCIUM_mg_L>=20 ~ "Highly susceptible")) %>% 
  #cslap
  mutate(conductance=case_when(
    SPEC_CONDUCTANCE<=125 ~ "Soft",
    between(SPEC_CONDUCTANCE, 125, 250) ~ "Average",
    SPEC_CONDUCTANCE>=250 ~ "Hard")) %>%
  mutate(color=case_when(
    TRUE_COLOR<=10 ~ "Uncolored",
    between(TRUE_COLOR, 10, 30) ~ "Weak",
    TRUE_COLOR>=30 ~ "High")) %>% 
  mutate(ph=case_when(
    `PH_epi`<=6.5 ~ "Acidic",
    between(`PH_epi`, 6.5, 7.5) ~ "~Neutral",
    between(`PH_epi`, 7.5, 8.5) ~ "Slightly alk",
    `PH_epi`>=8.5 ~ "Highly alk")) %>% 
  #leech
  mutate(leech=case_when(
    PHOSPHORUS_mg_L<=0.030 & TRUE_COLOR<=20 ~ "Blue",
    PHOSPHORUS_mg_L>0.030 & TRUE_COLOR<=20 ~ "Green",
    PHOSPHORUS_mg_L<=0.030 & TRUE_COLOR>20 ~ "Brown",
    PHOSPHORUS_mg_L>0.030 & TRUE_COLOR>20 ~ "Murky"
  )) %>% 
  #chloride
  mutate(chloride=case_when(
    `CHLORIDE_mg_L` <= 35 ~ "Low",
    between(`CHLORIDE_mg_L`,35,250) ~ "Medium",
    `CHLORIDE_mg_L` >= 250 ~ "High",
    Eval_Status=="Target_Sampled" & is.na(`CHLORIDE_mg_L`) ~ "Low")) %>% 
  #algal composition
  rename(CHLOROPHYTE=`CHLOROPHYLL A (PROBE) CONCENTRATION, CHLOROPHYTE (GREEN ALGAE)_OW_NA`,
         CRYPTOPHYTA=`CHLOROPHYLL A (PROBE) CONCENTRATION, CRYPTOPHYTA (CRYPTOPHYTES)_OW_NA`,
         CYANOBACTERIA=`CHLOROPHYLL A (PROBE) CONCENTRATION, CYANOBACTERIA (BLUEGREEN)_OW_NA`,
         DINOPHYTA=`CHLOROPHYLL A (PROBE) CONCENTRATION, DINOPHYTA (DIATOMS)_OW_NA`,
         PROBE_TOTAL=`CHLOROPHYLL A (PROBE) CONCENTRATION, TOTAL_OW_NA`) %>% 
  mutate(chlorophyte=case_when(
    CHLOROPHYTE/PROBE_TOTAL<=0.25 ~ "Low",
    between(CHLOROPHYTE/PROBE_TOTAL,0.25,0.5) ~ "Medium",
    CHLOROPHYTE/PROBE_TOTAL>=0.5 ~ "High")) %>% 
  mutate(cryptophyta=case_when(
    CRYPTOPHYTA/PROBE_TOTAL<=0.25 ~ "Low",
    between(CRYPTOPHYTA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
    CRYPTOPHYTA/PROBE_TOTAL>=0.5 ~ "High")) %>% 
  mutate(cyanobacteria=case_when(
    CYANOBACTERIA/PROBE_TOTAL<=0.25 ~ "Low",
    between(CYANOBACTERIA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
    CYANOBACTERIA/PROBE_TOTAL>=0.5 ~ "High")) %>% 
  mutate(dinophyta=case_when(
    DINOPHYTA/PROBE_TOTAL<=0.25 ~ "Low",
    between(DINOPHYTA/PROBE_TOTAL,0.25,0.5) ~ "Medium",
    DINOPHYTA/PROBE_TOTAL>=0.5 ~ "High")) %>% 
  #alkalinity
  mutate(alkalinity=case_when(
    ALKALINITY_mg_L <= 60 ~ "Soft",
    between(ALKALINITY_mg_L,60,120) ~ "Moderately hard",
    between(ALKALINITY_mg_L,120,180) ~ "Hard",
    ALKALINITY_mg_L >= 180 ~ "Very hard"))

# Create a Target/NonTarget status variable
att<-att %>% 
  mutate(statusTNT="Target",
         statusTNT=ifelse(Eval_Status=="NonTarget","NonTarget",statusTNT))

# remove duplicates (multiple samples of same lake)
att <- att %>%
  mutate(SAMPLE_DATE = ymd(SAMPLE_DATE)) %>% # convert to date
  mutate(month = month(SAMPLE_DATE)) %>% # extract month from date
  group_by(siteID) %>%
  mutate(has_aug = 8 %in% month) %>% # annotate as having date in august
  filter(has_aug & month == 8 |
           !has_aug) %>% # filter only august dates from sites with or all from those without
  slice_max(n = 1,
            order_by = SAMPLE_DATE,
            with_ties = F) %>% # change with_ties to TRUE to discard NA values
  select(-has_aug)

att<-att %>%  #creating adjusted weights
  ungroup() %>% 
  mutate(WgtAdj=case_when(
    PROB_CAT=="(1,4]"~(1828/29),
    PROB_CAT=="(4,10]"~(2490/15),
    PROB_CAT=="(10,20]"~(1003/18),
    PROB_CAT=="(20,50]"~(616/20),
    PROB_CAT==">50"~(500/33)))


#read in data on excursions
excursions <- read.csv("probability.data.excursions.csv")
excursions <- excursions %>% 
  select(parameter,n_exceedances,LAKE_ID) %>% 
  spread(parameter,n_exceedances,fill = 0) %>% 
  mutate(LAKE_ID=toupper(LAKE_ID)) %>% 
  mutate(all_params=case_when(
    ammonia==0 & chlorophyll_a==0 & dissolved_oxygen==0 & nitrite==0 & ph==0 & phosphorus==0 ~0,
    LAKE_ID=TRUE~1 #create fully supporting variable
  )) %>% 
  rename(AMMONIA_excursions=ammonia,
         DO_excursions=dissolved_oxygen,
         NITRITE_excursions=nitrite,
         PH_excursions=ph,
         PHOSPHORUS_excursions=phosphorus,
         Fully_support=all_params)

att<-merge(att,excursions,by="LAKE_ID") #merging in fishing use excursions

Additionally, I will load in NLA, NAP and NH data:

# read in NLA data
setwd("~/OneDrive - New York State Office of Information Technology Services/Rscripts/Probability.Sampling/")

chem <- read.csv("nla_2017_water_chemistry_chla-data.csv") #read in chemistry data

chem <- chem %>% 
  filter(ANALYTE %in% c("NTL","CHLA","PTL","NITRATE_N","COLOR","CHLORIDE","MAGNESIUM","DOC","COND","AMMONIA_N","CALCIUM","PH"),
         NARS_FLAG == "") %>% 
  select(UID,SITE_ID,DATE_COL,ANALYTE,RESULT) %>% 
  pivot_wider(names_from= ANALYTE ,values_from = RESULT,) %>% 
  rename(TN=NTL,TP=PTL)

secchi <- read.csv("nla_2017_secchi-data.csv") %>% #read in secchi data
  mutate(SECCHI=(DISAPPEARS+REAPPEARS)/2, #average the two depths
         DATE_COL=mdy(DATE_COL)) %>% 
  select(UID,SITE_ID,DATE_COL,SECCHI)

oxygen <- read.csv("nla_2017_profile-data.csv") #read in dissolved oxygen data

oxygen <- oxygen %>% 
  select(UID,SITE_ID,DATE_COL,DEPTH,OXYGEN) %>% 
  # dissolved oxygen:
  # keep DO values between 0 and 2m depth
  mutate(keep=case_when(
    DEPTH <= 2 ~ "yes",
    DEPTH > 2 ~ "no")) %>%
  filter(keep=="yes") %>%
  select(-keep)

# mean DO for each lake of top 2m
DO <- aggregate(oxygen$OXYGEN,list(oxygen$SITE_ID,oxygen$DATE_COL),FUN=mean)
# add back in
oxygen <- merge(oxygen,DO,by.x=c("SITE_ID","DATE_COL"),by.y=c("Group.1","Group.2"),all.x=TRUE)
oxygen$x <- as.numeric(oxygen$x)
oxygen <- oxygen %>% 
  select(-c(DEPTH,OXYGEN)) %>%
  rename(OXYGEN=x) %>% 
  distinct()

att.nla <- merge(chem,oxygen,by=c("UID","SITE_ID","DATE_COL"),all.y=TRUE,all.x=TRUE) %>% #merging chemistry and oxygen data
  mutate(DATE_COL=dmy(DATE_COL))
att.nla <- merge(att.nla,secchi,by=c("UID","SITE_ID","DATE_COL"),all.x=TRUE) #merging in secchi data
att.nla[,4:17] <- lapply(att.nla[,4:17],as.numeric) #convert all columns to numeric
att.nla$NP_ratio <- as.numeric(att.nla$NITRATE_N)/(as.numeric(att.nla$TP)/1000) #create n:p ratio
att.nla <- att.nla %>% #rename to match above
  rename(CHLOROPHYLL_ug_L=CHLA,
         NITROGEN_mg_L=TN,
         PHOSPHORUS_ug_L=TP,
         CHLORIDE_mg_L=CHLORIDE,
         DOC_mg_L=DOC,
         CALCIUM_mg_L=CALCIUM,
         TRUE_COLOR=COLOR,
         SPEC_CONDUCTANCE=COND,
         MAGNESIUM_OW_TOTAL=MAGNESIUM,
         SECCHI_m=SECCHI,
         NP_ratio=NP_ratio)

# selecting parameters and adding trophic status

att.nla<-att.nla %>% 
  # trophic status
  mutate(phos_trophic=case_when(
    PHOSPHORUS_ug_L<=10 ~ "oligotrophic",
    between(PHOSPHORUS_ug_L,10,20) ~ "mesotrophic",
    PHOSPHORUS_ug_L>=20 ~ "eutrophic")) %>%
  mutate(chla_trophic=case_when(
    CHLOROPHYLL_ug_L<=2 ~ "oligotrophic",
    between(CHLOROPHYLL_ug_L,2,8) ~ "mesotrophic",
    CHLOROPHYLL_ug_L>=8 ~ "eutrophic")) %>%
  # EPA thresholds
  mutate(TP_threshold=case_when(
    PHOSPHORUS_ug_L<=16 ~ "Good",
    between(PHOSPHORUS_ug_L, 16, 27.9) ~ "Fair",
    PHOSPHORUS_ug_L>=27.9 ~ "Poor")) %>%
  mutate(TN_threshold=case_when(
    NITROGEN_mg_L<=0.428 ~ "Good",
    between(NITROGEN_mg_L, 0.428, 0.655) ~ "Fair",
    NITROGEN_mg_L>=0.655 ~ "Poor")) %>%
  mutate(CHLA_threshold=case_when(
    CHLOROPHYLL_ug_L<=4.52 ~ "Good",
    between(CHLOROPHYLL_ug_L, 4.52, 8.43) ~ "Fair",
    CHLOROPHYLL_ug_L>=8.43 ~ "Poor")) %>%
  # dissolved oxygen
  mutate(d.oxygen=case_when(
    OXYGEN<=3 ~ "Poor",
    between(OXYGEN, 3, 5) ~ "Fair",
    OXYGEN>=5 ~ "Good")) %>% 
  #leech
  mutate(leech=case_when(
    PHOSPHORUS_ug_L<=30 & TRUE_COLOR<=20 ~ "Blue",
    PHOSPHORUS_ug_L>30 & TRUE_COLOR<=20 ~ "Green",
    PHOSPHORUS_ug_L<=30 & TRUE_COLOR>20 ~ "Brown",
    PHOSPHORUS_ug_L>30 & TRUE_COLOR>20 ~ "Murky"
  )) %>% 
  #chloride
  mutate(chloride=case_when(
    CHLORIDE_mg_L <= 35 ~ "Low",
    between(CHLORIDE_mg_L,35,250) ~ "Medium",
    CHLORIDE_mg_L >= 250 ~ "High"))

sites <- read.csv("nla_2017_site_information-data.csv") #read in site info for filtering

nap.sites <- sites %>% 
  mutate(include=case_when(
    AG_ECO9=="NAP" & AREA_HA>2.63 ~ "yes", #creating subcategory to keep sites in northern appalachian ecoregion with area >6.4 acres
    SITE_ID=TRUE ~ "no"
  )) %>% 
  select(UID,SITE_ID,AREA_CAT6,EVAL_CAT,YCOORD,XCOORD,WGT_TP_EXTENT,include) %>% 
  filter(WGT_TP_EXTENT != 0)

nla.sites <- sites %>% 
  mutate(include=case_when(
    AREA_HA>2.63 ~ "yes", #creating subcategory to keep sites with area >6.4 acres
    SITE_ID=TRUE ~ "no"
  )) %>% 
  select(UID,SITE_ID,AREA_CAT6,EVAL_CAT,YCOORD,XCOORD,WGT_TP_EXTENT,include) %>% 
  filter(WGT_TP_EXTENT != 0)

nla.att <- merge(att.nla,nla.sites,by=c("UID","SITE_ID"))
nap.att <- merge(att.nla,nap.sites,by=c("UID","SITE_ID"))
nh.att <- read.csv("New_Hampshire_Lakes_2017_Design_Status_20211027_KH_EPA.csv", na.strings=c(""," ","NA")) #read in site data

nh.att<-nh.att %>% 
  mutate(include=case_when(
    AREA_HA>2.63 ~ "yes", #create subcategory for keeping lakes larger than 6.4 acres
    SITE_ID=TRUE ~ "no"))

chem <- read.csv("BasicNHChem_ForNY.csv") #read in chem data

#merge by site
nh.att <- merge(nh.att,chem,by.x="SITE_ID",by.y="NLA.ID",all.x=T) %>% 
  mutate(CHLORIDE=case_when(
    CHLORIDE!="<3" ~ as.numeric(CHLORIDE), #method detection limit for chloride is 3, convert smaller values to 0
    CHLORIDE=="<3" ~ 0)) %>% 
  mutate(CHLORIDE_threshold=case_when(
    CHLORIDE<=35 ~ "Low",
    between(CHLORIDE,35,250) ~ "Medium",
    CHLORIDE>=250 ~ "High"
  )) %>% 
  mutate(TN_UG.L=TN_UG.L/1000) %>% 
  rename(CHLOROPHYLL_ug_L=CHLOROPHYLL.A.x,
         NITROGEN_mg_L=TN_UG.L,
         PHOSPHORUS_ug_L=PHOSPHORUS.ug.L.x,
         CHLORIDE_mg_L=CHLORIDE)

Categorical Variable Analysis

Here I perform the analysis using cat_analysis(). The relevant categorical variable / threshold should be input as the vars argument. vars can be a list of all categories you are interested in, which would create a very big dataframe.

In the future, the weight argument should be part of the site design and might not be called WgtAdj.

#list variables you are interested in and defined above
vars <- c("phos_trophic", "chla_trophic", "TP_threshold", "TN_threshold", "CHLA_threshold", "microcystin", "d.oxygen", "N_LIMIT", "secchi", "zebra", "conductance", "color", "ph", "leech", "chloride","alkalinity","chlorophyte","cryptophyta","cyanobacteria","dinophyta")

#analysis
CatExtent <- cat_analysis(
  dframe=att,
  vars=vars, 
  subpops = ,
  siteID = "siteID",
  weight = "WgtAdj", #name will probably change in future
  xcoord = "xcoord",
  ycoord = "ycoord")

table <- CatExtent %>% 
  select(Indicator,
         Category,
         Estimate.P,
         LCB95Pct.P,
         UCB95Pct.P) %>% 
  filter(Category!="Total") %>% #remove totals
  mutate(Category=factor(Category, levels=c("Poor","Fair","Good", #factor for plotting
                                            "Blue","Green","Brown","Murky",
                                            "High","Weak","Uncolored",
                                            "Medium","Low",
                                            "Acidic","~Neutral","Slightly alk","Highly alk",
                                            "Soft","Moderately hard","Average","Hard","Very hard",
                                            "Highly susceptible","May be susceptible","Not susceptible",
                                            "Eutrophic","Mesotrophic","Oligotrophic",
                                            "N-limited","Co-limited","P-limited",
                                            "Non-detect","Microcystin detected","Most disturbed",
                                            "Excursion","Non-excursion")),
         Indicator=case_when( #rename for plotting
           Indicator=="phos_trophic"~"Phosphorus", 
           Indicator=="chla_trophic"~"Chlorophyll-a", 
           Indicator=="TP_threshold"~"Total phosphorus", 
           Indicator=="TN_threshold"~"Total nitrogen", 
           Indicator=="CHLA_threshold"~"Total chlorophyll", 
           Indicator=="microcystin"~"Microcystin",
           Indicator=="d.oxygen"~"Dissolved oxygen",
           Indicator=="N_LIMIT"~"Nutrient limitation", 
           Indicator=="secchi"~"Secchi", 
           Indicator=="zebra"~"Zebra mussel susceptibility", 
           Indicator=="conductance"~"Hardness", 
           Indicator=="color"~"Color", 
           Indicator=="ph"~"pH", 
           Indicator=="leech"~"Nutrient-color status", 
           Indicator=="chloride"~"Chloride",
           Indicator=TRUE~Indicator)
  )

hux <- as_hux(table) #I use huxtable because kable doesn't work on my computer but that is also fine 
number_format(hux) <- 2
theme_plain(hux)
IndicatorCategoryEstimate.PLCB95.00Pct.PUCB95.00Pct.P
PhosphorusEutrophic22.3511.2533.45
PhosphorusMesotrophic34.3318.3550.31
PhosphorusOligotrophic43.3228.9257.72
Chlorophyll-aEutrophic11.434.9717.89
Chlorophyll-aMesotrophic69.3356.1282.53
Chlorophyll-aOligotrophic19.247.0831.41
Total phosphorusFair18.316.5330.09
Total phosphorusGood64.8951.8577.92
Total phosphorusPoor16.816.0627.56
Total nitrogenFair14.265.6622.86
Total nitrogenGood66.7952.5681.02
Total nitrogenPoor18.955.8832.02
Total chlorophyllFair34.0917.7650.41
Total chlorophyllGood54.4938.3370.64
Total chlorophyllPoor11.434.9717.89
MicrocystinNon-detect100.00100.00100.00
Dissolved oxygenFair1.280.003.46
Dissolved oxygenGood91.1680.11100.00
Dissolved oxygenPoor7.550.0018.47
Nutrient limitationCo-limited17.323.5131.12
Nutrient limitationN-limited0.780.002.13
Nutrient limitationP-limited81.9068.0195.80
SecchiEutrophic46.3731.3761.37
SecchiMesotrophic48.6933.5063.88
SecchiOligotrophic4.940.809.09
Zebra mussel susceptibilityHighly susceptible29.7212.5446.90
Zebra mussel susceptibilityMay be susceptible14.870.0031.71
Zebra mussel susceptibilityNot susceptible55.4134.8076.02
HardnessAverage16.412.3830.45
HardnessHard23.197.9138.48
HardnessSoft60.3944.3776.42
ColorHigh68.6452.2085.07
ColorUncolored8.280.1416.41
ColorWeak23.0910.2535.92
pH~Neutral47.3430.0364.65
pHAcidic15.264.4226.10
pHHighly alk15.281.4929.08
pHSlightly alk22.1210.2833.95
Nutrient-color statusBlue21.597.1536.04
Nutrient-color statusBrown54.5034.9474.06
Nutrient-color statusGreen1.080.003.07
Nutrient-color statusMurky22.836.7638.90
ChlorideHigh6.600.0016.90
ChlorideLow77.1964.3790.01
ChlorideMedium16.215.6026.81
alkalinityHard11.470.1822.75
alkalinityModerately hard20.637.2434.03
alkalinitySoft67.9054.4381.37
chlorophyteHigh75.4558.6092.30
chlorophyteLow13.420.0028.43
chlorophyteMedium11.131.6020.66
cryptophytaLow100.00100.00100.00
cyanobacteriaHigh16.190.3232.07
cyanobacteriaLow65.9347.8284.03
cyanobacteriaMedium17.886.7529.00
dinophytaHigh2.110.005.90
dinophytaLow87.8478.3897.29
dinophytaMedium10.061.5118.61
ny.cat <- table %>% filter(Indicator %in% c("Total phosphorus","Total nitrogen","Total chlorophyll","Dissolved oxygen","Chloride")) %>% mutate(Study="NY") #prep for NLA/NH/NAP comparison plots later

probability.cat <- table %>% filter(Indicator %in% c("Phosphorus","Chlorophyll-a","Secchi")) #prep for historical bias plot later

Continuous Variable Analysis

Here I perform the analysis using cont_analysis(). The relevant CONTINUOUS variable should be input as the vars argument. vars can be a list of all categories you are interested in, which would create a very big dataframe with all. The output of this function is a list with 3 estimations: cumulative distribution function (CDF), percentiles, and means.

As above, in the future the weight argument should be part of the site design and might not be called WgtAdj.

#To conduct analysis on a continuous variable, using a new list of CONTINUOUS variables, also note that cont_analysis() doesn't like variable names that have spaces so you will need to rename these. It's helpful to put the units in the name.

myvars <- c("CHLOROPHYLL_ug_L","NITROGEN_mg_L","PHOSPHORUS_mg_L","CHLORIDE_mg_L","DISSOLVED_OXYGEN_mg_L","DOC_mg_L","CALCIUM_mg_L","SECCHI_m","TRUE_COLOR","SPEC_CONDUCTANCE","AMMONIA_N","MAGNESIUM_OW_TOTAL","PH_hypo","ARSENIC_BS_TOTAL", "ARSENIC_OW_TOTAL", "IRON_BS_TOTAL", "IRON_OW_TOTAL",  "MAGNESIUM_BS_TOTAL", "MAGNESIUM_OW_TOTAL","ALKALINITY_mg_L","NP_ratio")

#Creates 3 estimations in list: CDF, percentiles, means.
analysis <- cont_analysis(
  dframe = att,
  vars = myvars,
  subpops = ,
  siteID = "siteID",
  weight = "WgtAdj",
  xcoord = "xcoord",
  ycoord = "ycoord")

NY<-analysis$CDF %>% #prep for comparison with NLA/NAP/NH plots later
  select(Indicator,Value,Estimate.P,LCB95Pct.P,UCB95Pct.P) %>% 
  mutate(Study="NY") %>% 
  mutate(Value=case_when(
    Indicator=="PHOSPHORUS_mg_L"~as.numeric(Value)*1000,
    Indicator=TRUE~as.numeric(Value)
  )) %>% 
  mutate(Indicator=case_when(
    Indicator=="PHOSPHORUS_mg_L"~"PHOSPHORUS_ug_L",
    Indicator=TRUE~Indicator
  )) 

str(analysis)
## List of 3
##  $ CDF :'data.frame':    659 obs. of  15 variables:
##   ..$ Type           : chr [1:659] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
##   ..$ Subpopulation  : chr [1:659] "All Sites" "All Sites" "All Sites" "All Sites" ...
##   ..$ Indicator      : chr [1:659] "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" ...
##   ..$ Value          : num [1:659] 1.00e-10 4.45e-01 5.50e-01 6.84e-01 1.05 ...
##   ..$ nResp          : num [1:659] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ Estimate.P     : num [1:659] 1.38 2.77 3.45 4.13 4.81 ...
##   ..$ StdError.P     : num [1:659] 1.22 1.7 1.82 1.95 2 ...
##   ..$ MarginofError.P: num [1:659] 2.39 3.32 3.56 3.81 3.93 ...
##   ..$ LCB95Pct.P     : num [1:659] 0 0 0 0.318 0.886 ...
##   ..$ UCB95Pct.P     : num [1:659] 3.77 6.09 7.01 7.95 8.74 ...
##   ..$ Estimate.U     : num [1:659] 30.8 61.6 76.8 91.9 107.1 ...
##   ..$ StdError.U     : num [1:659] 27 37.4 39.6 41.9 42.6 ...
##   ..$ MarginofError.U: num [1:659] 52.9 73.4 77.5 82.1 83.5 ...
##   ..$ LCB95Pct.U     : num [1:659] 0 0 0 9.76 23.54 ...
##   ..$ UCB95Pct.U     : num [1:659] 83.7 135 154.3 174 190.6 ...
##  $ Pct :'data.frame':    147 obs. of  10 variables:
##   ..$ Type         : chr [1:147] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
##   ..$ Subpopulation: chr [1:147] "All Sites" "All Sites" "All Sites" "All Sites" ...
##   ..$ Indicator    : chr [1:147] "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" "CHLOROPHYLL_ug_L" ...
##   ..$ Statistic    : chr [1:147] "5Pct" "10Pct" "25Pct" "50Pct" ...
##   ..$ nResp        : num [1:147] 5 9 15 26 33 45 48 4 4 10 ...
##   ..$ Estimate     : num [1:147] 1.08 1.08 1.08 1.08 1.08 ...
##   ..$ StdError     : num [1:147] 0.411 0.357 0.433 0.586 1.209 ...
##   ..$ MarginofError: num [1:147] 0.805 0.7 0.848 1.148 2.369 ...
##   ..$ LCB95Pct     : num [1:147] 0 0.513 1.564 2.542 4.584 ...
##   ..$ UCB95Pct     : num [1:147] 1.65 1.94 3.3 4.89 9.43 ...
##  $ Mean:'data.frame':    21 obs. of  9 variables:
##   ..$ Type         : chr [1:21] "All_Sites" "All_Sites" "All_Sites" "All_Sites" ...
##   ..$ Subpopulation: chr [1:21] "All Sites" "All Sites" "All Sites" "All Sites" ...
##   ..$ Indicator    : chr [1:21] "CHLOROPHYLL_ug_L" "NITROGEN_mg_L" "PHOSPHORUS_mg_L" "CHLORIDE_mg_L" ...
##   ..$ nResp        : int [1:21] 55 47 60 55 33 41 33 59 35 38 ...
##   ..$ Estimate     : num [1:21] 5.0524 0.5559 0.0186 44.8451 4.8785 ...
##   ..$ StdError     : num [1:21] 0.45911 0.08874 0.00235 18.67918 0.74625 ...
##   ..$ MarginofError: num [1:21] 0.8998 0.1739 0.0046 36.6105 1.4626 ...
##   ..$ LCB95Pct     : num [1:21] 4.153 0.382 0.014 8.235 3.416 ...
##   ..$ UCB95Pct     : num [1:21] 5.9522 0.7298 0.0232 81.4556 6.3411 ...

Plots

Here I have plotted the analyses from above. The categorical variables are plotted as dots, maps and bars; the continuous variables are plotted as a distribution function. They are also grouped by QAPP parameter categories in the “Grouped plots” tab.

Category dot plot

plot<-ggplot(table,aes(x=Category,y=Estimate.P)) +
  geom_point()+
  geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
  theme(legend.position = "none")+
  facet_wrap(.~Indicator,scales = "free")+
  ylim(0,100)+
  labs(title="Condition estimates across NYS Ponded Waters",y="Percent of Total",x="Condition category")+
  theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))

plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))

gridExtra::grid.arrange(plot)

Cumulative distribution function

#To plot a cumulative distribution function:
plot <- ggplot(analysis$CDF,aes(x=Value,y=Estimate.P,ymin=LCB95Pct.P,ymax=UCB95Pct.P))+
  geom_line()+
  geom_point()+
  geom_ribbon(alpha=0.5)+
  ylim(0,100)+
  facet_wrap(.~Indicator, scales="free")+
  guides(fill=FALSE,shape=FALSE)+
  labs(y="Percent of lakes")

plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))

gridExtra::grid.arrange(plot)

Map

#fetch map
states <- map_data("state")
ny <- subset(states, region %in% c("new york"))

att$CHLA_threshold <- factor(att$CHLA_threshold, levels=c("Good","Fair","Poor"))
ggplot() + 
  geom_polygon(data=ny,aes(x = long, y = lat, group = group), color = "white") + 
  coord_fixed(1.3) +
  geom_point(data=att,aes(x=xcoord,y=ycoord,color=CHLA_threshold))+
  guides(fill=FALSE)+
  theme(panel.border = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

Grouped plots

Categorical

#list variables you are interested in and defined above
trophic <- c("phos_trophic","chla_trophic","N_LIMIT","secchi","leech","color")
minerals <- c("zebra","alkalinity")
in.situ <- c("ph","conductance")
habs <- c("microcystin","chlorophyte","cryptophyta","cyanobacteria","dinophyta")
fishing <- c("AMMONIA_excursions","DO_excursions","NITRITE_excursions","PH_excursions","Fully_support")

vars.list <- list(Trophic=trophic,Minerals=minerals,`In Situ`=in.situ,HABs=habs,`Fishing use`=fishing) #name groups for labeling in plots

n<-0

for(i in vars.list){ #loop to plot each group as 1 facet wrapped image
  
  n<-n+1
  
  CatExtent <- cat_analysis(
    dframe=att,
    vars=i, 
    subpops = , 
    siteID = "siteID",
    weight = "WgtAdj", 
    xcoord = "xcoord",
    ycoord = "ycoord")
  
  table <- CatExtent %>% 
    select(Indicator,
           Category,
           Estimate.P,
           LCB95Pct.P,
           UCB95Pct.P) %>% 
    filter(Category!="Total") %>% 
    mutate(Category=case_when(
      Category==0  ~ "Non-excursion", #for fishing use
      Category==1 ~ "Excursion",
      Category=TRUE~Category
    )) %>% 
    mutate(Category=factor(Category, levels=c("Poor","Fair","Good", #factor for plots
                                              "Blue","Green","Brown","Murky",
                                              "High","Weak","Uncolored",
                                              "Medium","Low",
                                              "Acidic","~Neutral","Slightly alk","Highly alk",
                                              "Soft","Moderately hard","Average","Hard","Very hard",
                                              "Highly susceptible","May be susceptible","Not susceptible",
                                              "Eutrophic","Mesotrophic","Oligotrophic",
                                              "N-limited","Co-limited","P-limited",
                                              "Non-detect","Microcystin detected","Most disturbed",
                                              "Excursion","Non-excursion")),
           Indicator=case_when( #rename for plots
             Indicator=="phos_trophic"~"Phosphorus", 
             Indicator=="chla_trophic"~"Chlorophyll-a", 
             Indicator=="TP_threshold"~"Total phosphorus", 
             Indicator=="TN_threshold"~"Total nitrogen", 
             Indicator=="CHLA_threshold"~"Total chlorophyll", 
             Indicator=="microcystin"~"Microcystin",
             Indicator=="d.oxygen"~"Dissolved oxygen",
             Indicator=="N_LIMIT"~"Nutrient limitation", 
             Indicator=="secchi"~"Secchi", 
             Indicator=="zebra"~"Zebra mussel susceptibility", 
             Indicator=="conductance"~"Conductance", 
             Indicator=="color"~"Color", 
             Indicator=="ph"~"pH", 
             Indicator=="leech"~"Nutrient-color status", 
             Indicator=='alkalinity'~"Alkalinity",
             Indicator=="chlorophyte"~"% Chlorophyte",
             Indicator=="cryptophyta"~"% Cryptophyta",
             Indicator=="cyanobacteria"~"% Cyanobacteria",
             Indicator=="dinophyta"~"% Dinophyta",
             Indicator=="AMMONIA_excursions"~"Ammonia",
             Indicator=="DO_excursions"~"Dissolved oxygen",
             Indicator=="NITRITE_excursions"~"Nitrite",
             Indicator=="PH_excursions"~"pH",
             Indicator=="Fully_support"~"All parameters",
             Indicator=TRUE~Indicator)
    )
  
  if(names(vars.list)[n]=="Fishing use"){ #plot fishing use differently
    plot1<-ggplot(table[table$Indicator=="All parameters",],aes(x=Category,y=Estimate.P,fill=Category)) + #all parameter normal bar plot
      geom_col()+
      geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
      theme(legend.position = "none")+
      ylim(0,100)+
      labs(y="Percent of Total",x="Fully supporting",title="Fishing Use Parameters")+
      theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))+
      scale_fill_grey()
    
    plot2<-ggplot(table[table$Indicator!="All parameters",],aes(x=Indicator,y=Estimate.P,fill=Category)) + #stacked plot for individual parameters
      geom_bar(position="stack", stat="identity")+
      ylim(0,100)+
      labs(y="",x="Parameter",title=" ")+
      theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))+
      scale_fill_grey()
    
    plot1 <- set_panel_size(p = plot1, width = unit(4, "cm"), height = unit(4,"cm"))
    plot2 <- set_panel_size(p = plot2, width = unit(6, "cm"), height = unit(4,"cm"))
    
    gridExtra::grid.arrange(plot1,plot2,ncol=2)
  }else{ #all other groups are plotted here
    plot<-ggplot(table,aes(x=Category,y=Estimate.P)) +
      geom_col()+
      geom_errorbar(aes(ymin=LCB95Pct.P,ymax=UCB95Pct.P),width=0.2)+
      theme(legend.position = "none")+
      facet_wrap(.~Indicator,scales = "free")+
      ylim(0,100)+
      labs(y="Percent of Total",x="Condition category",title=paste(names(vars.list)[n],"Parameters"))+
      theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1))
    
    plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
    
    gridExtra::grid.arrange(plot)
  }
  
  
  
}

Continuous

trophic <- c("CHLOROPHYLL_ug_L",
             "NITROGEN_mg_L",
             "PHOSPHORUS_mg_L",
             "SECCHI_m",
             "TRUE_COLOR",
             "NP_ratio")
minerals <- c("CALCIUM_mg_L","ALKALINITY_mg_L")
salt <- c("CHLORIDE_mg_L")
in.situ <- c("DISSOLVED_OXYGEN_mg_L",
             "PH_hypo",
             "SPEC_CONDUCTANCE")
metals <- c("ARSENIC_BS_TOTAL",
            "ARSENIC_OW_TOTAL",
            "IRON_BS_TOTAL",
            "IRON_OW_TOTAL",
            "MAGNESIUM_BS_TOTAL",
            "MAGNESIUM_OW_TOTAL")
habs <- c("CYANOBACTERIA")

vars.list <- list(Trophic=trophic,Minerals=minerals,Salt=salt,`In Situ`=in.situ,Metals=metals,HABs=habs) #named for plotting

n<-0

for(i in vars.list){ #loop to plot each group as 1 facet wrapped image
  
  n<-n+1
  
  #Creates 3 estimations in list: CDF, percentiles, means.
  analysis <- cont_analysis(
    dframe = att,
    vars = i,
    subpops = ,
    siteID = "siteID",
    weight = "WgtAdj",
    xcoord = "xcoord",
    ycoord = "ycoord")
  
  table<-analysis$CDF %>% 
    select(Indicator,Value,Estimate.P,LCB95Pct.P,UCB95Pct.P) %>% 
    filter(Indicator %in% i) %>% 
    mutate(Indicator=case_when( #rename for plots
      Indicator=="CHLOROPHYLL_ug_L"~"Chlorophyll-a (ug/L)",
      Indicator=="NITROGEN_mg_L"~"Total nitrogen (mg/L)",
      Indicator=="PHOSPHORUS_mg_L"~"Total phosphorus (mg/L)",
      Indicator=="DISSOLVED_OXYGEN_mg_L"~"Dissolved oxygen (mg/L)",
      Indicator=="ALKALINITY_mg_L"~"Alkalinity (mg/L)",
      Indicator=="CHLORIDE_mg_L"~"Chloride (mg/L)",
      Indicator=="CALCIUM_mg_L"~"Calcium (mg/L)",
      Indicator=="SECCHI_m"~"Secchi disk depth (m)",
      Indicator=="TRUE_COLOR"~"True color (PCU)",
      Indicator=="SPEC_CONDUCTANCE"~"Conductance (mS/cm)",
      Indicator=="PH_hypo"~"pH (SU)",
      Indicator=="ARSENIC_BS_TOTAL"~"Arsenic, bottom (ug/L)",
      Indicator=="ARSENIC_OW_TOTAL"~"Arsenic, open water (ug/L)",
      Indicator=="IRON_BS_TOTAL"~"Iron, bottom (ug/L)",
      Indicator=="IRON_OW_TOTAL"~"Iron, open water (ug/L)",
      Indicator=="MAGNESIUM_BS_TOTAL"~"Magnesium, bottom (ug/L)",
      Indicator=="MAGNESIUM_OW_TOTAL"~"Magnesium, open water (ug/L)",
      Indicator=="NP_ratio"~"DIN:TP ratio",
      Indicator=="CYANOBACTERIA"~"% Cyanobacteria",
      Indicator=TRUE~Indicator
    )) %>% 
    mutate(Lower=case_when( #defining lower thresholds for plots
      Indicator=="Chlorophyll-a (ug/L)"~2,
      Indicator=="Total phosphorus (mg/L)"~0.01,
      Indicator=="Specific conductance (mS/cm)"~125,
      Indicator=="Alkalinity (mg/L)"~60,
      Indicator=="Calcium (mg/L)"~10,
      Indicator=="True color (PCU)"~10,
      Indicator=="Secchi disk depth (m)"~2,
      Indicator=TRUE~as.numeric(""))) %>% 
    mutate(Upper=case_when( #defining upper thresholds for plots
      Indicator=="Chlorophyll-a (ug/L)"~8,
      Indicator=="Total phosphorus (mg/L)"~0.02,
      Indicator=="Specific conductance (mS/cm)"~250,
      Indicator=="Alkalinity (mg/L)"~120,
      Indicator=="Calcium (mg/L)"~20,
      Indicator=="True color (PCU)"~20,
      Indicator=="Secchi disk depth (m)"~5,
      Indicator=TRUE~as.numeric("")))
  
  plot<-ggplot(table,aes(x=Value,y=Estimate.P,ymin=LCB95Pct.P,ymax=UCB95Pct.P))+
    geom_line()+
    geom_point()+
    geom_ribbon(alpha=0.5)+
    ylim(0,100)+
    guides(fill="none",shape="none")+
    facet_wrap(.~Indicator, scales="free")+
    geom_vline(aes(xintercept=Lower),linetype="dashed")+
    geom_vline(aes(xintercept=Upper),linetype="dashed")+
    labs(y="Percent of lakes",title=paste(names(vars.list[n]),"Parameters"))+
    scale_color_grey()
  
  plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
  
  gridExtra::grid.arrange(plot)
  
}

Comparisons

Here I make the comparisons to NLA and NH data from above, as well as to historical LMAS sampling. Thresholds for NLA, NH and NY comparisons are from the NLA and are “good”, “fair”, and “poor” for chlorophyll-a, phosphorus, nitrogen and epilimnetic dissolved oxygen. There are no thresholds for chloride.

The values for LMAS sampling are derived from a separate script that counts the number of lakes in each size class from samples in the last 10 years.

Category dot plot

## During execution of the program, 3 warning messages were generated.  The warning 
## messages are stored in a data frame named 'warn_df'.  Enter the following 
## command to view the warning messages: warnprnt() 
## To view a subset of the warning messages (say, messages number 1, 3, and 5), 
## enter the following command: warnprnt(m=c(1,3,5))
## During execution of the program, a warning message was generated.  The warning 
## message is stored in a data frame named 'warn_df'.  Enter the following command 
## to view the warning message: warnprnt()
## During execution of the program, 10 warning messages were generated.  The warning 
## messages are stored in a data frame named 'warn_df'.  Enter the following 
## command to view the warning messages: warnprnt() 
## To view a subset of the warning messages (say, messages number 1, 3, and 5), 
## enter the following command: warnprnt(m=c(1,3,5))
plot <- ggplot(cats,aes(x=Category,y=Estimate.P,color=Study,shape=Study))+
  geom_point(position=position_dodge(width=0.2))+
  ylim(0,100)+
  geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=.25, position=position_dodge(width=0.2))+
  facet_wrap(.~Indicator, scales="free")+
  labs(title="Condition category extent estimates")

plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))

gridExtra::grid.arrange(plot)

Cumulative distribution function

## During execution of the program, a warning message was generated.  The warning 
## message is stored in a data frame named 'warn_df'.  Enter the following command 
## to view the warning message: warnprnt()
## During execution of the program, a warning message was generated.  The warning 
## message is stored in a data frame named 'warn_df'.  Enter the following command 
## to view the warning message: warnprnt()
#list variables you are interested in and defined above
trophic <- c("CHLOROPHYLL_ug_L",
             "NITROGEN_mg_L",
             "PHOSPHORUS_ug_L",
             "SECCHI_m",
             "TRUE_COLOR",
             "NP_ratio")
minerals <- c("CALCIUM_mg_L","ALKALINITY_mg_L")
salt <- c("CHLORIDE_mg_L")
in.situ <- c("DISSOLVED_OXYGEN_mg_L",
             "PH_hypo",
             "SPEC_CONDUCTANCE")
metals <- c("ARSENIC_BS_TOTAL",
            "ARSENIC_OW_TOTAL",
            "IRON_BS_TOTAL",
            "IRON_OW_TOTAL",
            "MAGNESIUM_BS_TOTAL",
            "MAGNESIUM_OW_TOTAL")

vars.list <- list(Trophic=trophic,Minerals=minerals,Salt=salt,`In Situ`=in.situ,Metals=metals)

n<-0

for(i in vars.list){ #loop to plot each group as 1 facet wrapped image
  
  n<-n+1
  
  table<-all %>% 
    filter(Indicator %in% i) %>% 
    mutate(Indicator=case_when(
      Indicator=="CHLOROPHYLL_ug_L"~"Chlorophyll-a (ug/L)",
      Indicator=="NITROGEN_mg_L"~"Total nitrogen (mg/L)",
      Indicator=="PHOSPHORUS_ug_L"~"Total phosphorus (ug/L)",
      Indicator=="DISSOLVED_OXYGEN_mg_L"~"Dissolved oxygen (mg/L)",
      Indicator=="ALKALINITY_mg_L"~"Alkalinity (mg/L)",
      Indicator=="CHLORIDE_mg_L"~"Chloride (mg/L)",
      Indicator=="CALCIUM_mg_L"~"Calcium (mg/L)",
      Indicator=="SECCHI_m"~"Secchi disk depth (m)",
      Indicator=="TRUE_COLOR"~"True color (PCU)",
      Indicator=="SPEC_CONDUCTANCE"~"Conductance (mS/cm)",
      Indicator=="PH_hypo"~"pH (SU)",
      Indicator=="ARSENIC_BS_TOTAL"~"Arsenic, bottom (ug/L)",
      Indicator=="ARSENIC_OW_TOTAL"~"Arsenic, open water (ug/L)",
      Indicator=="IRON_BS_TOTAL"~"Iron, bottom (ug/L)",
      Indicator=="IRON_OW_TOTAL"~"Iron, open water (ug/L)",
      Indicator=="MAGNESIUM_BS_TOTAL"~"Magnesium, bottom (ug/L)",
      Indicator=="MAGNESIUM_OW_TOTAL"~"Magnesium, open water (ug/L)",
      Indicator=="DOC_mg_L"~"Dissolved organic carbon (mg/L)",
      Indicator=="NP_ratio"~"DIN:TP ratio",
      Indicator=TRUE~Indicator
    )) %>%
    filter((Indicator=="Chlorophyll-a (ug/L)"&Value<50)| #cutting off extreme values for certain variables
             (Indicator=="Chloride (mg/L)"&Value<250)|
             (Indicator=="Total nitrogen (mg/L)"&Value<2.5)|
             (Indicator=="Total phosphorus (ug/L)"&Value<150)|
             (Indicator=="Dissolved organic carbon (mg/L)"&Value<50)|
             (Indicator=="Calcium (mg/L)"&Value<50)|
             (Indicator=="Conductance (mS/cm)" & Value<1000)|
             (Indicator=="Magnesium, open water (ug/L)"&Value<25000)|
             (Indicator=="DIN:TP ratio"&Value<50)|
             !(Indicator %in% c("Chlorophyll-a (ug/L)","Chloride (mg/L)","Total nitrogen (mg/L)", "Total phosphorus (ug/L)","Dissolved organic carbon (mg/L)","Calcium (mg/L)","Conductance (mS/cm)","Magnesium, open water (ug/L)","DIN:TP ratio"))) %>% 
    mutate(Study=factor(Study, levels=c("NY","NLA","NAP","NH")))
  
  plot <- ggplot(table,aes(x=Value,y=Estimate.P,color=Study,shape=Study,linetype=Study))+
    geom_line()+
    ylim(0,100)+
    scale_linetype_manual(values=c("solid","dotdash", "dotted","dashed"))+
    facet_wrap(.~Indicator, scales="free")+
    labs(y="Percent of lakes",title=paste(names(vars.list[n]),"Parameters"))
  
  plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
  
  gridExtra::grid.arrange(plot)
  
}

Historical bias

The plot below shows the distribution of the size classes and trophic classes included in the data frame. For comparison, I’ve plotted the proportion of NYS lakes sampled in each size class since 2010.

#Plotting data frame distribution as well as NYS sample distribution collected since 2010
#LMAS numbers retrieved in 2020, script to do so is in Probability.Results.2021.rmd

frame<-att %>% 
  select(PROB_CAT,AREA_CAT) %>% 
  rename(size_category=AREA_CAT) %>% 
  distinct() %>%
  mutate(Probability_Sites = case_when( #statewide estimate from sampling design frame
    endsWith(size_category, "(1,4]") ~ ((1828/6437)*100),
    endsWith(size_category, "(4,10]") ~ ((2490/6437)*100),
    endsWith(size_category, "(10,20]") ~ ((1003/6437)*100),
    endsWith(size_category, "(20,50]") ~ ((616/6437)*100),
    endsWith(size_category, ">50") ~ ((500/6437)*100))) %>%
  mutate(LMAS_Sites = case_when(
    endsWith(size_category, "(1,4]") ~ (7.64),
    endsWith(size_category, "(4,10]") ~ (16.7),
    endsWith(size_category, "(10,20]") ~ (16.2),
    endsWith(size_category, "(20,50]") ~ (19.3),
    endsWith(size_category, ">50") ~ (40.1))) %>% 
  rename("Statewide"=Probability_Sites,
         "LMAS"=LMAS_Sites) %>% 
  ungroup() %>%
  select(-PROB_CAT) %>% 
  gather(study,percentage,-size_category) %>% 
  arrange(study,size_category) %>% 
  mutate(percentage=as.numeric(percentage),
         size_category=factor(size_category,levels=c("(1,4]","(4,10]","(10,20]","(20,50]",">50")))

plot <- ggplot(frame, aes(x=size_category,y=percentage,fill=study)) +
  geom_bar(stat = "identity", position = position_dodge())+
  labs(title="Size classes",y="Percent of Total",x="Size category (hectares)")+
  scale_fill_grey()+
  theme(legend.position = "none")

plot1 <- set_panel_size(p = plot, width = unit(6, "cm"), height = unit(4,"cm")) #save plot for later
forplot<-probability.cat %>% 
  mutate(study="Statewide") %>% 
  add_row(Indicator="Chlorophyll-a",
          Category="Eutrophic",
          study="LMAS",
          Estimate.P=((215/473)*100)) %>% 
  add_row(Indicator="Chlorophyll-a",
          Category="Mesotrophic",
          study="LMAS",
          Estimate.P=((182/473)*100)) %>% 
  add_row(Indicator="Chlorophyll-a",
          Category="Oligotrophic",
          study="LMAS",
          Estimate.P=((76/473)*100)) %>% 
  add_row(Indicator="Phosphorus",
          Category="Eutrophic",
          study="LMAS",
          Estimate.P=((211/473)*100)) %>% 
  add_row(Indicator="Phosphorus",
          Category="Mesotrophic",
          study="LMAS",
          Estimate.P=((141/473)*100)) %>% 
  add_row(Indicator="Phosphorus",
          Category="Oligotrophic",
          study="LMAS",
          Estimate.P=((121/473)*100)) %>% 
  add_row(Indicator="Secchi",
          Category="Eutrophic",
          study="LMAS",
          Estimate.P=((371/700)*100)) %>% 
  add_row(Indicator="Secchi",
          Category="Mesotrophic",
          study="LMAS",
          Estimate.P=((279/700)*100)) %>% 
  add_row(Indicator="Secchi",
          Category="Oligotrophic",
          study="LMAS",
          Estimate.P=((50/700)*100)) %>% 
  mutate(LCB95Pct.P=ifelse(is.na(LCB95Pct.P),Estimate.P,LCB95Pct.P),
         UCB95Pct.P=ifelse(is.na(UCB95Pct.P),Estimate.P,UCB95Pct.P))


plot <- ggplot(forplot, aes(x=Category,fill=study)) +
  geom_bar_pattern(stat = "identity", position = position_dodge(),
                   aes(y=Estimate.P,
                       pattern_type = study,
                       pattern_fill = study), 
                   pattern= 'magick',
                   fill= 'white',
                   colour= 'black',
                   pattern_scale = 0.5,
                   pattern_key_scale_factor = 1.1)+
  geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=0.2,
                position=position_dodge(.9))+
  facet_wrap(~Indicator,scales = "free")+
  ylim(0,100)+
  labs(title="Trophic Classes across NYS versus LMAS samples since 2010",y="Percent of Total",x="Trophic Classes")+
  theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1),
        legend.key.height = unit(1, 'cm'),
        legend.key.width = unit(1, 'cm'))+
  scale_pattern_type_discrete(choices = c("gray15","bricks"))

# plot <- set_panel_size(p = plot, width = unit(4, "cm"), height = unit(4,"cm"))
# 
# gridExtra::grid.arrange(plot)
#plot for just phosphorus trophic bias
plot2 <- ggplot(forplot[forplot$Indicator=="Phosphorus",], aes(x=Category,fill=study,y=Estimate.P)) +
  geom_bar(stat = "identity", position = position_dodge())+
  geom_errorbar(aes(ymin=LCB95Pct.P, ymax=UCB95Pct.P), width=0.2,
                position=position_dodge(.9))+
  labs(title="Phosphorus",x="Trophic class",y="")+
  theme(axis.text.x = element_text(angle = 45,vjust = 1, hjust=1),
        legend.key.height = unit(1, 'cm'),
        legend.key.width = unit(1, 'cm'))+
  scale_fill_grey()

plot2 <- set_panel_size(p = plot2, width = unit(4, "cm"), height = unit(4,"cm"))

gridExtra::grid.arrange(plot1, plot2,ncol=2) #plot both side by side

Session info

sessionInfo(package=NULL)
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] spsurvey_5.0.1  survey_4.1-1    survival_3.2-13 Matrix_1.3-4   
##  [5] sf_1.0-3        maps_3.4.0      ggmap_3.0.0     ggpattern_0.4.2
##  [9] egg_0.4.5       gridExtra_2.3   lubridate_1.8.0 huxtable_5.4.0 
## [13] forcats_0.5.1   stringr_1.4.0   dplyr_1.0.7     purrr_0.3.4    
## [17] readr_2.0.2     tidyr_1.1.4     tibble_3.1.6    ggplot2_3.3.5  
## [21] tidyverse_1.3.1
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-153        bitops_1.0-7        fs_1.5.0           
##  [4] httr_1.4.2          tools_4.1.2         backports_1.3.0    
##  [7] bslib_0.3.1         utf8_1.2.2          R6_2.5.1           
## [10] AlgDesign_1.2.0     KernSmooth_2.23-20  DBI_1.1.1          
## [13] colorspace_2.0-2    withr_2.4.2         sp_1.4-5           
## [16] tidyselect_1.1.1    compiler_4.1.2      cli_3.1.0          
## [19] rvest_1.0.2         xml2_1.3.2          labeling_0.4.2     
## [22] sass_0.4.0          scales_1.1.1        classInt_0.4-3     
## [25] proxy_0.4-26        commonmark_1.7      crossdes_1.1-1     
## [28] digest_0.6.28       minqa_1.2.4         rmarkdown_2.11     
## [31] jpeg_0.1-9          pkgconfig_2.0.3     htmltools_0.5.2    
## [34] lme4_1.1-27.1       highr_0.9           dbplyr_2.1.1       
## [37] fastmap_1.1.0       rlang_0.4.12        readxl_1.3.1       
## [40] rstudioapi_0.13     farver_2.1.0        jquerylib_0.1.4    
## [43] generics_0.1.1      jsonlite_1.7.2      gtools_3.9.2       
## [46] magrittr_2.0.1      Rcpp_1.0.7          munsell_0.5.0      
## [49] fansi_0.5.0         lifecycle_1.0.1     stringi_1.7.5      
## [52] yaml_2.2.1          MASS_7.3-54         plyr_1.8.6         
## [55] crayon_1.4.2        deldir_1.0-6        lattice_0.20-45    
## [58] haven_2.4.3         splines_4.1.2       hms_1.1.1          
## [61] knitr_1.36          pillar_1.6.4        boot_1.3-28        
## [64] rjson_0.2.20        reprex_2.0.1        glue_1.5.0         
## [67] evaluate_0.14       mitools_2.4         modelr_0.1.8       
## [70] nloptr_1.2.2.3      png_0.1-7           vctrs_0.3.8        
## [73] tzdb_0.2.0          RgoogleMaps_1.4.5.3 cellranger_1.1.0   
## [76] gtable_0.3.0        assertthat_0.2.1    xfun_0.28          
## [79] broom_0.7.10        e1071_1.7-9         class_7.3-19       
## [82] units_0.7-2         ellipsis_0.3.2

Time elapsed

end.time <- Sys.time()

elapsed.time <- round((end.time - start.time), 3)
print(elapsed.time)
## Time difference of 15.361 mins